import numpy as np
import pandas as pd
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
pd.options.display.max_columns = None
pd.set_option('display.float_format', lambda x: '%.2f' % x)
df = pd.read_excel('Online Retail.xlsx')
df.head()
Price
print(df.UnitPrice.min())
print(df.UnitPrice.max())
LOL some of our prices are negative. Is this maybe supposed to be returns?
However, we also have negative quantities, and the negative quantities aren't associated with negative price values...
print(df.Quantity.min())
print(df.Quantity.max())
df[df['Quantity'] == df.Quantity.min()]
What was the item that cost ~40k? Does that price make sense?
df[df['UnitPrice'] == df.UnitPrice.max()]
Not sure what "Manual" means. Here again we see negative quantities. Maybe it makes sense if we look at it in the context of all their transactions?
df[df['CustomerID'] == 15098]
It looks like the negative quantities are returns of previous purchases. Maybe the negative prices we saw before are bad data?
How many customers do we have?
len(df.CustomerID.unique())
How many invoices do we have?
len(df.InvoiceNo.unique())
missing data
df.isna().sum()
What date range are we covering?
print(df.InvoiceDate.min())
print(df.InvoiceDate.max())
country = df.groupby('Country').agg({'InvoiceNo':'count'}).reset_index()
country.columns = ['country', 'number_invoices']
country.head()
COLORS = ['rgba(155, 69, 242, 1)', 'rgba(242, 69, 155,1)', 'rgba(69, 155, 242, 1)', 'rgba(69, 242, 69,1)',
'rgba(247, 133, 19, 1)', 'rgba(19, 247, 186, 1)', 'rgba(87, 19, 247, 1)', 'rgba(212, 19, 247, 1)',
'rgba(226, 244, 22, 1)', 'rgba(244, 195, 22, 1)', 'rgba(166,252,229,1)', 'rgba(234,227,252,1)',
'rgba(246,209,252,1)', 'rgba(244,251,168,1)', 'rgba(251,233,168,1)', 'rgba(218,186,250,1)'
'rgba(248,169,209,1)', 'rgba(170,209,248,1)', 'rgba(185,250,187,1)', 'rgba(252,209,169,1)'
'rgba(214,62,79,1)', 'rgba(245,108,67,1)', 'rgba(84,37,132,1)', 'rgba(132,37,85,1)'
'rgba(44,99,154,1)', 'rgba(3,101,94,1)', 'rgba(158,1,66,1)', 'rgba(1,60,49,1)'
'rgba(162,162,162,1)', 'rgba(88,6,51,1)', 'rgba(53,151,142,1)', 'rgba(70,70,70,1)',
'rgba(138,74,108,1)']
fig = {
'data': [
{
'values': list(country['number_invoices']),
'labels': list(country['country']),
'textposition':'inside',
'hole': .4,
'type': 'pie',
'marker': {'colors': COLORS[0:len(list(country['country']))]}
}],
'layout': {
'title': 'Invoices by country'
}
}
iplot(fig)
data = [go.Histogram(x=df.Quantity, marker=dict(color='#EB89B5'))]
layout = go.Layout(
title='Quantity Histogram'
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
zoom = df[df['Quantity'] < 200]
zoom = zoom[zoom['Quantity'] > 0]
data = [go.Histogram(x=zoom.Quantity, marker=dict(color='#EB89B5'), nbinsx = 50)]
layout = go.Layout(
title='Zoomed in Quantity Histogram'
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
data = [go.Histogram(x=df.UnitPrice, marker=dict(color=COLORS[3]))]
layout = go.Layout(
title='Unit Price Histogram'
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
zoom = df[df['UnitPrice'] < 100]
zoom = zoom[zoom['UnitPrice'] > 0]
data = [go.Histogram(x=zoom.UnitPrice, marker=dict(color=COLORS[3]), nbinsx = 50)]
layout = go.Layout(
title='Zoomed In Unit Price Histogram'
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# number of unique items purchased per customer
unique = df[['Description', 'CustomerID']]
unique = unique.drop_duplicates()
unique_items_per_customer = unique.groupby('CustomerID').agg({'Description': 'count'}).reset_index()
unique_items_per_customer.columns = ['customer_id', 'unique_items']
unique_items_per_customer.head()
data = [go.Histogram(x=unique_items_per_customer.unique_items, marker=dict(color=COLORS[4]))]
layout = go.Layout(
title='Number of Unique Items Purchased Per Customer'
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)